Introduction

In this section, we ask the question: “How does GDP per capita relate to selected quality of life and human-geographic metrics?”

The Human-Geographic information we have is:

[Info here]

The Quality of Life indicators are:

[Info here]

We begin the analysis by reading in the data (and filtering the relevant columns) as is done in the following R code.

# Function to display the columns and nan counts
col_names_and_nas <- function(df) {
  for (i in seq_along(df)) {
    cat(colnames(df)[i], ":", sum(is.na(df[[i]])), "\n")
  }
}

# read data
full_df <- read.csv("../data/clean_data.csv", header = TRUE)

# Trim data
cols_to_keep <- c("Geography..Map.references", "Geography..Geographic.coordinates", "Geography..Area...land", "Geography..Area...total", "Geography..Climate", "Geography..Land.use...agricultural.land", "Economy..Real.GDP.per.capita", "People.and.Society..Population...total", "People.and.Society..Population.growth.rate", "People.and.Society..Birth.rate", "People.and.Society..Death.rate", "People.and.Society..Maternal.mortality.ratio", "People.and.Society..Infant.mortality.rate...total", "People.and.Society..Life.expectancy.at.birth...total.population" ,"People.and.Society..Physician.density" ,"Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants", "Communications..Internet.users...percent.of.population")

trim_df <- full_df[,cols_to_keep]
trim_df$Country <- full_df[,1]
rownames(trim_df) <- full_df[,1]

We now concern ourselves with the cleaning of the selected columns as many have strings mixed in with numeric values.

library(stringr)

# Geography..Area...land
trim_df$Geography..Area...land <- as.numeric(gsub(",", "", str_extract(trim_df$Geography..Area...land, "[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "Geography..Area...land"] <- "Land.sqkm"

# Geography..Area...total
trim_df$Geography..Area...total <- as.numeric(gsub(",", "", str_extract(trim_df$Geography..Area...total, "[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "Geography..Area...total"] <- "Total.sqkm"

# Geography..Climate
trim_df$Geography..Climate <- str_extract(trim_df$Geography..Climate, "^[^;]+")
colnames(trim_df)[colnames(trim_df) == "Geography..Climate"] <- "Climate"

# Geography..Land.use...agricultural.land
trim_df$Geography..Land.use...agricultural.land <- as.numeric(sub("%", "", str_extract(trim_df$Geography..Land.use...agricultural.land, "[0-9.]+%")))/100
colnames(trim_df)[colnames(trim_df) == "Geography..Land.use...agricultural.land"] <- "Arable.land.percent"

# Economy..Real.GDP.per.capita
trim_df$Economy..Real.GDP.per.capita <- as.numeric(gsub("[$,]", "", str_extract(trim_df$Economy..Real.GDP.per.capita, "\\$[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "Economy..Real.GDP.per.capita"] <- "Real.GDP.per.capita.USD"

# People.and.Society..Population.growth.rate
trim_df$People.and.Society..Population.growth.rate <- as.numeric(sub("%", "", str_extract(trim_df$People.and.Society..Population.growth.rate, "[0-9.]+%"))) / 100
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Population.growth.rate"] <- "Population.growth.rate.percent"

# People.and.Society..Birth.rate - births 
trim_df$People.and.Society..Birth.rate <- as.numeric(str_extract(trim_df$People.and.Society..Birth.rate, "^[0-9]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Birth.rate"] <- "Live.births.per.1000.people"

# People.and.Society..Death.rate - deaths per person 
trim_df$People.and.Society..Death.rate <- as.numeric(str_extract(trim_df$People.and.Society..Death.rate, "^[0-9]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Death.rate"] <- "Deaths.per.1000.people"

# People.and.Society..Maternal.mortality.ratio - deaths per live birth
trim_df$People.and.Society..Maternal.mortality.ratio <- as.numeric(gsub(",", "", str_extract(trim_df$People.and.Society..Maternal.mortality.ratio, "^[0-9]+"))) / 100000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Maternal.mortality.ratio"] <- "Maternal.deaths.per.100000.live.births"

# People.and.Society..Infant.mortality.rate...total - deaths per live birth
trim_df$People.and.Society..Infant.mortality.rate...total <- as.numeric(str_extract(trim_df$People.and.Society..Infant.mortality.rate...total, "^[0-9.]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Infant.mortality.rate...total"] <- "Infant.deaths.per.1000.live.births"

# People.and.society..Life.expectancy.at.birth...total.population
trim_df$People.and.Society..Life.expectancy.at.birth...total.population <- as.numeric(str_extract(trim_df$People.and.Society..Life.expectancy.at.birth...total.population, "^[0-9.]+"))
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Life.expectancy.at.birth...total.population"] <- "Life.expectancy.at.birth"

# People.and.Society..Physician.density
trim_df$People.and.Society..Physician.density <- as.numeric(str_extract(trim_df$People.and.Society..Physician.density, "^[0-9.]+")) / 1000
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Physician.density"] <- "Physicians.per.1000.people"

# Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants
trim_df$Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants <- as.numeric(str_extract(trim_df$Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants, "^[0-9]+")) / 100
colnames(trim_df)[colnames(trim_df) == "Communications..Telephones...mobile.cellular...subscriptions.per.100.inhabitants"] <- "Avg.cellphone.subscriptions.per.person"

# Communications..Internet.users...percent.of.population
trim_df$Communications..Internet.users...percent.of.population <- as.numeric(str_extract(trim_df$Communications..Internet.users...percent.of.population, "^[0-9.]+")) / 100
colnames(trim_df)[colnames(trim_df) == "Communications..Internet.users...percent.of.population"] <- "Internet.users.percent.of.population"

# People.and.Society..Population...total
trim_df$People.and.Society..Population...total <- as.numeric(gsub(",", "", str_extract(trim_df$People.and.Society..Population...total, "^[0-9,]+")))
colnames(trim_df)[colnames(trim_df) == "People.and.Society..Population...total"] <- "Total.population"

# Geography..Map.references
colnames(trim_df)[colnames(trim_df) == "Geography..Map.references"] <- "Continent"

# Note, you must re-run the whole notebook to not have issues with the nan overwriting

Note that we have removed entries that may not be countries so our numbers may not reflect those of other surveys. The preprocessing is however, accessible.

The continents we examine are:

continents <- unlist(unique(trim_df$Continent))
continents
## [1] "Asia"                              "Europe"                           
## [3] "Africa"                            "Central America and the Caribbean"
## [5] "South America"                     "Oceania"                          
## [7] "Middle East"                       "North America"                    
## [9] "Southeast Asia"

Individual Analysis

Basic Analysis of Aggregates of Human-Geographic Metrics

Land, Arable Land, Proportion

# Land per continent
continent_land <- trim_df %>%
  group_by(Continent) %>%
  summarise(TotalLand = sum(Land.sqkm, na.rm = TRUE))

  # Arable land per continent
continent_arable_land <- trim_df %>%
  mutate(ArableLand = (Arable.land.percent) * Land.sqkm) %>%
  group_by(Continent) %>%
  summarise(TotalArableLand = sum(ArableLand, na.rm = TRUE))

# Combine into long format
combined <- left_join(continent_land, continent_arable_land, by="Continent") %>%
  pivot_longer(cols = c(TotalLand, TotalArableLand),
               names_to = "Type",
               values_to = "Area") %>%
  mutate(Type = recode(Type,
                       TotalLand = "Total Land",
                       TotalArableLand = "Arable Land"))

# Sort
continent_order <- combined %>%
  filter(Type == "Total Land") %>%
  arrange(desc(Area)) %>%
  pull(Continent)


combined$Continent <- factor(combined$Continent, levels=continent_order)

# Plot - skqm of land and arable land
ggplot(combined, aes(x = Continent, y = Area, fill = Type)) +
  geom_bar(stat = "identity", position = position_dodge(width=0.8), width = 0.7, color = "black") +
  scale_y_continuous(labels= comma) + 
  labs(
    title = "Land vs. Arable Land Area by Region",
    x = "Region", 
    y = "Area (sq. km)",
    fill = "Land Type"
  ) +
  theme_minimal(base_size=14) +
  theme(axis.text.x = element_text(angle = 45, hjust=1)) +
  scale_fill_brewer(palette="Set1")

# Compute the proportion of arable land
continent_proportion <- trim_df %>%
  mutate(ArableLand = (Arable.land.percent) * Land.sqkm) %>%
  group_by(Continent) %>%
  summarise(
    TotalLand = sum(Land.sqkm, na.rm = TRUE),
    TotalArableLand = sum(ArableLand, na.rm = TRUE)
  ) %>%
  mutate(Proportion = TotalArableLand / TotalLand)

# Sort by proportion
continent_proportion$Continent <- factor(
  continent_proportion$Continent,
  levels = continent_proportion$Continent[order(-continent_proportion$Proportion)]
)

# Plot - sqkm of arable land ratio
ggplot(continent_proportion, aes(x = Continent, y = Proportion, fill = Continent)) +
  geom_bar(stat = "identity", width = 0.7, color = "black") + 
  scale_y_continuous(labels= percent_format(accuracy = 1)) +
  labs(
    title = "Proportion of Arable Land by Region",
    x = "Region",
    y = "Arable Land (% of Total Land)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle=45, hjust=1),
    legend.position = "none"
  ) +
  scale_fill_brewer(palette = "Paired")

At this point in the analysis we look at a few quality of life indicators including Real GDP per Capita and some other statistics concerning public health, again per region. (These should by put in one side-by-side analysis)

# Function to generate a bar chart for a specific numeric variable
plot_bar_chart <- function(data, variable, group_var = "Continent", agg_fun = mean, palette = "Paired", title = NULL, y_label = NULL, x_label = "Region") {
  
  # Check if the column exists in the dataframe
  if(!(variable %in% colnames(data))) {
    stop(paste("Column", variable, "not found in the dataframe"))
  }
  
  # Create a summarized dataset by the chosen grouping variable
  summary_data <- data %>%
    group_by(.data[[group_var]]) %>%
    summarise(Aggregate = agg_fun(.data[[variable]], na.rm = TRUE)) %>%
    arrange(desc(Aggregate))
  
  # Define title and y-label if not provided
  if (is.null(title)) title <- paste("Average", variable, "by", group_var)
  if (is.null(y_label)) y_label <- paste("Average", variable)
  
  # Plot the bar chart
  p <- ggplot(summary_data, aes(x = reorder(.data[[group_var]], -Aggregate), y = Aggregate, fill = .data[[group_var]])) +
    geom_bar(stat = "identity", width = 0.7, color = "black") +
    scale_y_continuous(labels = scales::comma) + 
    labs(
      title = title,
      x = x_label, 
      y = y_label
    ) +
    theme_minimal(base_size = 14) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "none") +
    scale_fill_brewer(palette = "Paired") +
    coord_flip()
  return(p)
}


# Function to create interactive boxplot with underlying points
plot_boxplot_with_points <- function(data, variable, group_var = "Continent", 
                                     title = NULL, y_label = NULL, x_label = "Region", 
                                     point_id_var = "Country") {
  data <- na.omit(data)
  # Check if required columns exist
  if (!(variable %in% colnames(data))) stop(paste("Column", variable, "not found in the dataframe"))
  if (!(group_var %in% colnames(data))) stop(paste("Group variable", group_var, "not found in the dataframe"))
  if (!(point_id_var %in% colnames(data))) stop(paste("Hover variable", point_id_var, "not found in the dataframe"))

  # Define default labels
  if (is.null(title)) title <- paste("Distribution of", variable, "by", group_var)
  if (is.null(y_label)) y_label <- variable

  p <- ggplot(data, aes(x = .data[[group_var]], y = .data[[variable]],
        fill=.data[[group_var]],         
        text = paste0(
          point_id_var, ": ", .data[[point_id_var]], "<br>",
          variable, ": ", .data[[variable]]
        ))) +
    geom_boxplot(
      width = 0.75,
      alpha=0.25,
      outlier.color=NA
    ) +
    geom_point(
      aes(colour = .data[[group_var]]),
      size = 1.3,
      alpha = 1,
      position = position_jitter( # obtain shifted points
        seed = 1, 
        width = .09
      )
    ) +
    labs(title = title, x = x_label, y = y_label) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") +
    coord_flip()
  
  return(ggplotly(p, tooltip = "text"))  # Return interactive plot
}

Real GDP per capita per region (2021 USD):

Avg_GDP_Cap_Bar <- plot_bar_chart(trim_df, "Real.GDP.per.capita.USD", title= "Average Real GDP per Capita by Region", y_label="Real GDP per Capita (2021 USD)", x_label="Region")
Box_GDP_Cap <- plot_boxplot_with_points(trim_df, "Real.GDP.per.capita.USD", title= "Average Real GDP per Capita by Region", y_label="Real GDP per Capita (2021 USD)", x_label="Region")

Number of physicians rate, “Physicians.per.1000.people”:

Box_Doc_Rate <- plot_boxplot_with_points(trim_df, "Physicians.per.1000.people", title= "Average No. Physicians per 1000 People", y_label="No. Physicians per 1000 People", x_label="Region")

Infant mortality rate, “Infant.deaths.per.1000.live.births”:

Box_IMR <- plot_boxplot_with_points(trim_df, "Infant.deaths.per.1000.live.births", title= "Average Infant Mortality Rate", y_label="Infant Deaths per 1000 Live Births", x_label="Region")

Maternal mortality rate, “Maternal.deaths.per.100000.live.births”:

Box_MMR <- plot_boxplot_with_points(trim_df, "Maternal.deaths.per.100000.live.births", title= "Average Maternal Mortality Rate", y_label="Maternal Deaths per 100,000 live Births", x_label="Region")

“Life.expectancy.at.birth”:

Box_Life_Expect <- plot_boxplot_with_points(trim_df, "Life.expectancy.at.birth", title= "Average Life expectancy at birth", y_label="Years After Birth", x_label="Region")

Death rate, “Deaths.per.1000.people”:

Box_Death_Rate <- plot_boxplot_with_points(trim_df, "Deaths.per.1000.people", title= "Average Death Rate", y_label="Deaths per 1000 People", x_label="Region")

Birth rate, “Live.births.per.1000.people”:

Box_Birth_Rate <- plot_boxplot_with_points(trim_df, "Live.births.per.1000.people", title= "Average Birth Rate", y_label="Live Births per 1000 people", x_label="Region")

Population Growth Rate”Population.growth.rate.percent”:

Box_Pop_Growth_Rate <- plot_boxplot_with_points(trim_df, "Population.growth.rate.percent", title= "Average Population Growth Rate", y_label="Growth Rate", x_label="Region")

Total population, “Population”:

Total_Pop_Bar <- plot_bar_chart(trim_df, "Total.population", title= "Total Population", y_label="No. of People", x_label="Region", agg_fun=sum)
Box_Total_Pop <- plot_boxplot_with_points(trim_df, "Total.population", title= "Total Population", y_label="No. of People", x_label="Region")

Note that the model of exponential population growth doesn’t totally work here, see Africa and Asia. The failing assumption is likely gender balance.

Average percent of population online, “Internet.users.percent.of.population”:

Box_Online <- plot_boxplot_with_points(trim_df, "Internet.users.percent.of.population", title= "Average Proportion of Population Online", y_label="Proportion of Internet Users", x_label="Region")

Average number of cellphone subscriptions per inhabitants, “Avg.cellphone.subscriptions.per.person”:

Box_Cell_Subs <- plot_boxplot_with_points(trim_df, "Avg.cellphone.subscriptions.per.person", title= "Average Number of Cellphone Subscriptions per Inhabitant", y_label="Subscriptions", x_label="Region")

World Total Population Bar Chart

Total_Pop_Bar

Plotly Boxplots

Box_GDP_Cap
Box_Total_Pop
Box_Pop_Growth_Rate
Box_MMR
Box_IMR
Box_Doc_Rate
Box_Birth_Rate
Box_Death_Rate
Box_Life_Expect
Box_Online
Box_Cell_Subs

Joint Analysis

PCA

Now we will do principal components analysis on the numeric-continuous variables visualized above and visualize the biplot. (code the datapoints by region – maybe cluster if it looks worthwhile)

Aggregate visualized numeric-continuous columns (Note we remove some with the na.omit command)

numeric_df <- trim_df[,c("Total.sqkm", "Arable.land.percent", "Real.GDP.per.capita.USD", "Physicians.per.1000.people", "Infant.deaths.per.1000.live.births", "Maternal.deaths.per.100000.live.births", "Life.expectancy.at.birth", "Deaths.per.1000.people", "Live.births.per.1000.people", "Population.growth.rate.percent", "Total.population", "Internet.users.percent.of.population", "Avg.cellphone.subscriptions.per.person")]

Perform PCA:

pca_result <- prcomp(na.omit(numeric_df), center=TRUE, scale. = TRUE)

summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.5430 1.2301 1.1545 0.92118 0.80601 0.77879 0.70818
## Proportion of Variance 0.4975 0.1164 0.1025 0.06527 0.04997 0.04666 0.03858
## Cumulative Proportion  0.4975 0.6139 0.7164 0.78165 0.83162 0.87828 0.91686
##                            PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     0.59083 0.49590 0.43754 0.41600 0.25634 0.23588
## Proportion of Variance 0.02685 0.01892 0.01473 0.01331 0.00505 0.00428
## Cumulative Proportion  0.94371 0.96263 0.97735 0.99067 0.99572 1.00000
raw_variance <- pca_result$sdev^2

pve <- raw_variance / sum(raw_variance)

cumulative_pve = cumsum(pve)

pca_performance_data <- data.frame(
  PC = factor(1:length(pve), levels = 1:length(pve)),
  RawVariance = raw_variance,
  PVE = pve,
  CumulativePVE = cumulative_pve
)

PCA Variances Plot:

raw_var_plot <- ggplot(pca_performance_data, aes(x=PC, y=RawVariance)) +
  geom_line(aes(group=1), color="red", size=1) +
  geom_point(color="red", size=3)+
  geom_hline(yintercept=1, linetype="dotted", color="black")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
  labs(
    title = "Raw Variance Plot", 
    x = "Principal Components",
    y = "Raw Variance"
  ) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, hjust=1), legend.position="none")
## NULL

Cumulative Proportion of Variance Explained (CPVE) plot:

cpve_plot <- ggplot(pca_performance_data, aes(x = PC, y = CumulativePVE, group=1)) +
  geom_line(color="blue", size=1) +
  geom_point(color="blue", size=3) +
  geom_hline(yintercept=0.9, linetype="dotted", color="black")
  labs(
    title = "Cumulative Proportion of Variance Explained (PVE)",
    x = "Principal Components",
    y = "Cumulative PVE" 
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none")
## NULL
raw_var_plot + cpve_plot

Here we plot the loading vectors of the PCA on the axes of the first and second principal components.

# Create dataframe for loadings
loadings_df <- as.data.frame(pca_result$rotation[, 1:2])
loadings_df$varname <- rownames(loadings_df)

ggplot(loadings_df, aes(x = PC1, y = PC2)) +
  geom_segment(aes(x = 0, y = 0, xend = PC1, yend = PC2), 
               arrow = arrow(length = unit(0.2,"cm")), color = "blue") +
  geom_text_repel(aes(label = varname), size = 3, max.overlaps = 100) +
  labs(title = "PCA Biplot: Loadings Only") +
  theme_minimal()

To interpret the principal components,we say the following:

  1. PC2 appears to contain information regarding country size in both population and land area, large values indicating larger countries. The only “out of place” variable contained in this is the “Deaths.per.1000.people” variable.

  2. PC1 appears to represent the developing status of a nation with larger scores being given to more developed nations.

  3. We can see something of a contrast between the variables of “Physicians.per.1000.people” and “Population.growth.rate.percent” along the line: \(\text{PC}_2 = \frac{1}{2}\text{PC}_1\). From this one may interpret, countries with a higher rate of physicians in the population have a smaller population growth rate.

Here we plot the most extreme observations defined: \[\text{extreme_score}_i = \sqrt{\text{PC}1_i^2 + \text{PC}2_i^2}\]

meta_df <- trim_df[, c("Country", "Continent")]
# Get PCA scores (observations)
scores_df <- as.data.frame(pca_result$x[, 1:2])
scores_df$Country <- rownames(scores_df)

# Add distance from origin in PC1-PC2 space
scores_df <- scores_df %>%
  mutate(Distance = sqrt(PC1^2 + PC2^2))

# Keep top 10 extreme countries - euclidean norm of first two principal components
top_extremes <- scores_df %>%
  arrange(desc(Distance)) %>%
  slice_head(n = 50)


top_extremes <- top_extremes %>%
  left_join(meta_df, by = "Country")

# Plot
ggplot(top_extremes, aes(x = PC1, y = PC2, label = Country, color = Continent)) +
  geom_point(size = 2) +
  geom_text_repel(
    size = 2,
    max.overlaps = Inf,        # allow all to be plotted
    force = 2,                 # increase repulsion strength
    force_pull = 0.1,          # balance between point and label
    box.padding = 0.5,         # more space around text
    point.padding = 0.3,       # space between text and point
    segment.color = "grey50",  # optional connector color
    segment.size = 0.4
  ) +
  geom_hline(yintercept=0, linetype="dotted", color="black") +
  geom_vline(xintercept=0, linetype="dotted", color="black") + 
  labs(title = "Top 50 Most Extreme Countries in PC1–PC2 Space",
       x = "PC1", y = "PC2") +
  theme_minimal() +
  scale_color_brewer(palette = "Paired")

KDE of PC Scores

This is a visualization of the full distribution of scores in the PC1-PC2 plane to complement the above plots. Note that from the underside of the KDE, each country and its respective values for PC1 and PC2 can be obtained.

# Get scores
pc_scores <- as.data.frame(pca_result$x[, 1:2])
colnames(pc_scores) <- c("PC1", "PC2")
pc_scores$Country <- scores_df$Country

# Define a symmetric range around (0,0)
max_abs <- max(abs(pc_scores$PC1), abs(pc_scores$PC2))
grid_limit <- ceiling(max_abs) + 1  # Pad

# 2D KDE
kde <- kde2d(pc_scores$PC1, pc_scores$PC2,
             n=100, lims= c(-grid_limit, grid_limit, -grid_limit, grid_limit))


p <- plot_ly() %>%
  add_surface(
    x = ~kde$x,
    y = ~kde$y,
    z = ~t(kde$z),  # kde2d and plotly disagree here, transpose it!
    showscale = FALSE,
    showlegend=FALSE,
    opacity = 0.8
  ) %>%
  
  add_markers(
    data = pc_scores,
    x = ~PC1,
    y = ~PC2,
    z = rep(min(kde$z), nrow(pc_scores)) - 0.002, #slight offset below the surface
    text = ~paste("Country: ", Country, "<br>PC1:", round(PC1, 2), "<br>PC2:", round(PC2, 2)),
    hoverinfo = "text",
    marker = list(color = 'cyan', size = 4),
    type = "scatter3d",
    mode = "markers"
  ) %>%
  
  layout(
    title = "Interactive 3D KDE of PC1-PC2",
    scene = list(
      xaxis = list(title = "PC1 (x)", range = c(-grid_limit, grid_limit)),
      yaxis = list(title = "PC2 (y)", range = c(-grid_limit, grid_limit)),
      zaxis = list(title = "Density")
    )
  )

p

Multivariate analysis of numeric-continuous variables

Pairs

Here we examine the pairs plot of all continuous variables. From these plots, we can see pairwise relationships between different variables.

numeric_df_normalized <- scale(numeric_df)



colnames(numeric_df_normalized) <- c("Land", "Ag. Land", "GDP/capita", "Doc. Rate", "Inf. Mort.", "Mat. Mort.", "Life Expect.", "Death Rate", "Birth Rate", "Pop. Growth", "Tot. Pop.", "Internet Use", "Cellphones")

# Set graphical parameters to adjust the size of the boxes
old_par <- par(no.readonly = TRUE)  # Save the original plot parameters
par(mar = c(8,8,3,3))  # Modify margins to make the plot larger (top, left, bottom, right)

pairs(numeric_df_normalized,
      labels = colnames(numeric_df_normalized),
      cex.labels = 0.8,            # label size
      font.labels = 2,             # bold
      gap = 1,                   # spacing between plots and labels
      upper.panel = NULL,
      pch = 21, 
      bg = "lightblue")         # optional: simplify upper triangle

par(old_par)

One noticing is that when GDP is slightly increased, the IMR, MMR, Birth Rate, and Population Growth Rate all decrease markedly. The life expectancy and internet use increase markedly. This appears to be a power-law distribution.

Radar plot of each continent

Variables Examined:

  • “Total.sqkm”

  • “Real.GDP.per.capita.USD”

  • “Life.expectancy.at.birth”

  • “Total.population”

Statistics Considered:

  • Minimum

  • Maximum

star_data <- trim_df[, c("Total.sqkm", "Real.GDP.per.capita.USD", "Life.expectancy.at.birth", "Total.population")]
rownames(star_data) <- trim_df$Country

# For continent-wise plotting
agg_star_data <- aggregate(
  star_data, 
  by = list(Continent = trim_df$Continent), 
  FUN = min,
  na.rm = TRUE
)
rownames(agg_star_data) <- agg_star_data$Continent
agg_star_data$Continent <- NULL  # remove redundant column
agg_star_data_scaled <- scale(agg_star_data)
colnames(agg_star_data_scaled) <- c("Sq. km.", "GDP per capita", "Life Expectancy", "Total Pop.")

# For country-wise plotting
star_data_scaled <- scale(star_data)
colnames(star_data_scaled) <- c("Sq. km.", "GDP per capita", "Life Expectancy", "Total Pop.")

Star Plots of Each Country

Star plots of each country using the same variables as the radar plots are available here

Faces plots of each country using the same variables as the radar plots are available here. Note that the names are hard to see due to extreme length. There was no clear workaround so double clicking on them in the plot highlights the name corresponding to the face directly below.